Initial Investigation and Cleaning

Let us begin with data loading.

# Load the raw BeachWatch San Diego data, keeping strings as characters.
# NOTE(review): setwd() in a script is fragile (breaks on other machines);
# kept here for compatibility with the original workflow.
setwd("~/PROJECTS/SD_WaterQuality")
df <- read.csv("beachwatch-sd.csv", stringsAsFactors = FALSE)

Check whether any columns contain only NA values, and if so remove them. As shown below, there were 22 of them.

# Flag columns that contain nothing but NA values.
# vapply() is type-stable (always returns a logical vector), unlike sapply(),
# and all(is.na(col)) states the intent more directly than sum(!is.na(col))==0.
no_na <- vapply(df, function(col) all(is.na(col)), logical(1))
sum(no_na)
## [1] 22
df <- df[!no_na]

In addition we have constant columns. We can remove them, too. As a result we are left with 16 variables. We can then look at what kinds of variables we have.

# Flag constant columns: a single unique value carries no information.
# vapply() guarantees a logical return even in edge cases where sapply()
# would silently change its output type.
constCols <- vapply(df, function(col) length(unique(col)) == 1, logical(1))
sum(constCols)
## [1] 41
df <- df[!constCols]
summary(df)
##  stationname        stationcode         sampledate       
##  Length:202257      Length:202257      Length:202257     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  collectiontime       labbatch          methodname       
##  Length:202257      Length:202257      Length:202257     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    analyte              unit               result        
##  Length:202257      Length:202257      Min.   :     -10  
##  Class :character   Class :character   1st Qu.:       4  
##  Mode  :character   Mode  :character   Median :      20  
##                                        Mean   :   22700  
##                                        3rd Qu.:      30  
##                                        Max.   :28000000  
##                                        NA's   :3986      
##  resultqualcode        qacode          sampleagency       targetlatitude 
##  Length:202257      Length:202257      Length:202257      Min.   :32.53  
##  Class :character   Class :character   Class :character   1st Qu.:32.71  
##  Mode  :character   Mode  :character   Mode  :character   Median :32.79  
##                                                           Mean   :32.86  
##                                                           3rd Qu.:33.03  
##                                                           Max.   :33.39  
##                                                                          
##  targetlongitude   labagency         submittingagency  
##  Min.   :-117.6   Length:202257      Length:202257     
##  1st Qu.:-117.3   Class :character   Class :character  
##  Median :-117.3   Mode  :character   Mode  :character  
##  Mean   :-117.3                                        
##  3rd Qu.:-117.2                                        
##  Max.   :-117.1                                        
## 

We still have one variable ‘result’ with several values.

Creating new variables.

Let us create station groups using the first 2 letters of each station code, and compute how many stations are in each group.

# Derive a station group from the first two characters of the station code.
df$stationgroup <- substr(df$stationcode, start = 1, stop = 2)
unique(df$stationgroup)
##  [1] "EH" "EN" "FM" "IB" "MB" "OC" "PL" "SC" "SE" "TJ"
df$stationgroup <- factor(df$stationgroup)
# Count the distinct station codes that fall within each station group.
tapply(df$stationcode, df$stationgroup,
    FUN = function(codes) length(unique(codes)))
## EH EN FM IB MB OC PL SC SE TJ 
## 71  5 10  8 46 11 10  2  7  2

Subgroup Counts.

# Record counts per station group.
table(df$stationgroup)
## 
##    EH    EN    FM    IB    MB    OC    PL    SC    SE    TJ 
## 61864 11180 15423 24010 33484 20771 17590   145 16326  1464
# Record counts per analyte (analysis type).
table(df$analyte)
## 
## Coliform, Fecal Coliform, Total         E. coli    Enterococcus 
##           65827           66992            4428           65010
library(kableExtra)
# Cross-tabulate analyte by station group and render as a markdown table.
kable(table(df[c("analyte", "stationgroup")]), format = "markdown")
EH EN FM IB MB OC PL SC SE TJ
Coliform, Fecal 19856 3728 5061 8046 10784 6893 5849 48 5360 202
Coliform, Total 20333 3729 5122 8044 10850 6949 5855 49 5437 624
E. coli 2047 2 437 150 967 108 26 0 170 521
Enterococcus 19628 3721 4803 7770 10883 6821 5860 48 5359 117
# Cross-tabulate analyte by measurement unit; centre-align the columns.
kable(table(df[c("analyte", "unit")]), format = "markdown", align = "ccc")
cfu/100mL MPN/100 mL
Coliform, Fecal 19569 46258
Coliform, Total 19477 47515
E. coli 11 4417
Enterococcus 19854 45156
# Cross-tabulate laboratory method by analyte.
kable(table(df[c("methodname", "analyte")]), format = "markdown")
Coliform, Fecal Coliform, Total E. coli Enterococcus
Colilert-18 0 4486 4408 0
Enterolert 0 0 0 42372
EPA 1600 0 0 0 1591
MTF 19303 19412 9 4742
SM 9221 B 0 25578 0 0
SM 9221 E 28914 4 6 0
SM 9222 B 16011 17471 5 16304
SM 9222 D 1599 41 0 1
# Count of missing 'result' values per station group.
tapply(df$result, df$stationgroup,
    FUN = function(x) sum(is.na(x)))
##   EH   EN   FM   IB   MB   OC   PL   SC   SE   TJ 
## 2035    2  410  187  853   77   47    0  167  208
# Share of missing 'result' values per station group
# (the mean of a logical vector is the proportion of TRUEs).
tapply(df$result, df$stationgroup,
    FUN = function(x) round(mean(is.na(x)), 4))
##     EH     EN     FM     IB     MB     OC     PL     SC     SE     TJ 
## 0.0329 0.0002 0.0266 0.0078 0.0255 0.0037 0.0027 0.0000 0.0102 0.1421
# Total 'result' per station group.
# BUG FIX: the original passed `rm.na = T` — a typo for `na.rm = TRUE`.
# Because sum()'s signature is sum(..., na.rm = FALSE), the misspelled
# argument fell into `...` and TRUE (= 1) was silently added to every
# group total. (The printed totals below therefore read one higher than
# the corrected values.)
kable(aggregate(result ~ stationgroup, data = df,
    FUN = function(x) sum(as.numeric(x), na.rm = TRUE)))
stationgroup result
EH 97170118
EN 4525249
FM 26512491
IB 29887839
MB 12398505
OC 3825557
PL 1705724
SC 2718
SE 2119548
TJ 4322525094

Here the EH group has the most missing values, and TJ group had been recording the greatest pollution.

Maps of Stations

Please note that Google restricts how many times you may request a map. If you need to run your markdown several times, I recommend obtaining the map once and saving it as an R object. Afterwards you can restore it and use it as needed.

library(ggplot2)
library(ggmap)
## Request a map from Google (done once; cached as an RDS file below)
# san_diego_county <- get_googlemap(center = c(-117.35, 32.96),# midpoints
#                                maptype = "terrain",
#                                zoom = 9,
#                                size = c(640, 640),
#                                color = "color")
## Save an object to a file
# saveRDS(san_diego_county, file = "san_diego_county.rds")
## Restore the object
san_diego_county <- readRDS(file = "san_diego_county.rds")

# Plot every station as a coloured triangle on the county-wide basemap.
ggmap(san_diego_county) +
  geom_point(
    data = df,
    aes(x = targetlongitude, y = targetlatitude, color = stationgroup),
    shape = 17,
    size = 2
  )

Or we can look at what happens near Mission Bay.

library(ggmap)
# Mission Bay basemap, fetched once and cached as an RDS file.
# san_diego_map <- get_googlemap(center = c(-117.22, 32.75),# "Mission Bay"
#                                maptype = "terrain",
#                                zoom = 12,
#                                size = c(640, 640),
#                                color = "color")
# # Save an object to a file
# saveRDS(san_diego_map, file = "san_diego_map.rds")
# Restore the object
san_diego_map <- readRDS(file = "san_diego_map.rds")
# Stations outside this zoomed-in extent are dropped by ggplot, which
# produces the "Removed ... rows" warning below.
ggmap(san_diego_map) +
  geom_point(
    data = df,
    aes(x = targetlongitude, y = targetlatitude, color = stationgroup),
    shape = 17,
    size = 2
  )
## Warning: Removed 114144 rows containing missing values (geom_point).

See citation: D. Kahle and H. Wickham.

ggmap: Spatial Visualization with ggplot2. The R Journal, 5(1), 144-161. URL http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf

Creating Columns with Dates and Time.

I will “paste” our date column and time column together.

library(lubridate)
# Combine the date and time columns into a single POSIXct timestamp.
df$date_time <- parse_date_time(paste(df$sampledate, df$collectiontime),
    orders = "ymd HMS")
# Use <- consistently for assignment (the original mixed in `=` here).
df$sampledate <- ymd(df$sampledate)
df$collectiontime <- hms(df$collectiontime)

Looking at average pollution by year for a station

I will show pollution values as sizes of station marks on a map. We have different kinds of pollution and different ways to measure them in different units, which may not translate into each other. You can see more detail about units here: http://www.cascadeanalytical.com/resources-downloads/faqs

First we average our data by day, because each station takes a different number of measurements.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Collapse repeated same-day measurements into a daily mean per station,
# analyte and unit (stations sample an uneven number of times per day).
# na.rm = TRUE spelled out: T/F are reassignable and unsafe shorthands.
daily_df <- df %>%
    select(stationcode, date_time, analyte, unit, result,
        stationgroup, sampledate, targetlatitude, targetlongitude) %>%
    group_by(date_time = floor_date(date_time, "day"), stationcode, stationgroup,
        analyte, unit, sampledate, targetlatitude, targetlongitude) %>%
    summarize(daily_mean = mean(result, na.rm = TRUE))
head(daily_df)
## # A tibble: 6 x 9
## # Groups:   date_time, stationcode, stationgroup, analyte, unit,
## #   sampledate, targetlatitude [6]
##   date_time           stationcode stationgroup analyte unit  sampledate
##   <dttm>              <chr>       <fct>        <chr>   <chr> <date>    
## 1 1998-07-21 00:00:00 SE-010      SE           Colifo… MPN/… 1998-07-21
## 2 1998-07-21 00:00:00 SE-010      SE           Colifo… MPN/… 1998-07-21
## 3 1998-07-21 00:00:00 SE-010      SE           Entero… MPN/… 1998-07-21
## 4 1998-07-21 00:00:00 SE-020      SE           Colifo… MPN/… 1998-07-21
## 5 1998-07-21 00:00:00 SE-020      SE           Colifo… MPN/… 1998-07-21
## 6 1998-07-21 00:00:00 SE-020      SE           Entero… MPN/… 1998-07-21
## # ... with 3 more variables: targetlatitude <dbl>, targetlongitude <dbl>,
## #   daily_mean <dbl>

Now I create a table with average values for every station for each year.

# Yearly mean of the daily means for every station/unit/analyte combination.
# na.rm = TRUE spelled out instead of the unsafe shorthand T.
yearlyResult <- daily_df %>%
    select(date_time, daily_mean, stationgroup,
        unit, analyte, targetlatitude, targetlongitude) %>%
    group_by(year = floor_date(date_time, "year"), stationgroup,
        unit, analyte, targetlatitude, targetlongitude) %>%
    summarize(yearlyMean = mean(daily_mean, na.rm = TRUE))
dim(yearlyResult)
## [1] 6984    7
options(width=100)
head(yearlyResult)
## # A tibble: 6 x 7
## # Groups:   year, stationgroup, unit, analyte, targetlatitude [6]
##   year                stationgroup unit     analyte       targetlatitude targetlongitude yearlyMean
##   <dttm>              <fct>        <chr>    <chr>                  <dbl>           <dbl>      <dbl>
## 1 1998-01-01 00:00:00 EN           cfu/100… Coliform, Fe…           33.1           -117.          1
## 2 1998-01-01 00:00:00 EN           cfu/100… Coliform, Fe…           33.1           -117.          1
## 3 1998-01-01 00:00:00 EN           cfu/100… Coliform, Fe…           33.1           -117.          1
## 4 1998-01-01 00:00:00 EN           cfu/100… Coliform, Fe…           33.1           -117.          1
## 5 1998-01-01 00:00:00 EN           cfu/100… Coliform, Fe…           33.1           -117.          1
## 6 1998-01-01 00:00:00 EN           cfu/100… Coliform, To…           33.1           -117.          1

Let us see what kind of ‘result’ values we got. First histogram shows a few huge values and a lot of smaller numbers, so applying logarithm function might help. Before taking a logarithm I added 1 to take care of zero values.

# Number of station/year combinations whose mean could not be computed.
sum(is.na(yearlyResult$yearlyMean))
## [1] 49
# Raw distribution: a few huge values dominate the scale.
hist(yearlyResult$yearlyMean, col="lightblue",
    main = " Histogram of Yearly Mean Values")

# Log-transformed distribution; the +1 guards against log10(0) = -Inf.
hist(log10(yearlyResult$yearlyMean+1), col="lightblue",
    main = " Histogram of Log(Yearly Mean Values+1)")

Applying the logarithm function definitely helps. Here I prefer the decimal logarithm because it is more informative: the integer part of the value shows the number of digits before the decimal point in the original value.

As we’ve seen in previous tables we have the most data for analyte==“Enterococcus” measured with units “MPN/100 mL”.

library(ggplot2)
library(ggmap)
# Per-station mean of log10(yearly mean + 1), restricted to the latitude
# band covered by the Mission Bay basemap.
# na.rm = TRUE spelled out instead of the unsafe shorthand T.
stationMeans <- yearlyResult %>% filter(targetlatitude > 32.65,
    targetlatitude < 32.85) %>%
    group_by(stationgroup, unit, analyte,
        targetlatitude, targetlongitude) %>%
    summarize(logMeanPollution = mean(log10(yearlyMean+1), na.rm = TRUE))
# Plot only the best-covered combination: Enterococcus in MPN/100 mL,
# with point size encoding the log mean pollution.
ggmap(san_diego_map) +
      geom_point(data =
              stationMeans[stationMeans$unit=="MPN/100 mL" &
                      stationMeans$analyte=="Enterococcus", ],
           aes(x = targetlongitude,
           y = targetlatitude,
          color = stationgroup,
               size = logMeanPollution), shape = 20) +
    ggtitle('Average Station Bacteria Counts for Enterococcus,
        Measured in MPN/100 mL')

Because our data is restricted to specific type of units and analyte we do not get all stations as before.

Note that we have range from 1 to 3 for values of logMeanPollution. It means that original values can differ by a multiple up to \(10^3 = 1000\).

Second most numerous data is for “Coliform, Total” measured in cfu/100mL.

library(ggplot2)
library(ggmap)
# Same map for the second-best-covered combination:
# total coliform measured in cfu/100mL.
coliform_cfu <- stationMeans[stationMeans$unit == "cfu/100mL" &
                             stationMeans$analyte == "Coliform, Total", ]
ggmap(san_diego_map) +
  geom_point(
    data = coliform_cfu,
    aes(x = targetlongitude, y = targetlatitude,
        color = stationgroup, size = logMeanPollution),
    shape = 20
  ) +
  ggtitle('Average Station  Bacteria Counts for "Coliform, Total", 
        Measured in cfu/100mL')

For La Jolla area:

library(ggmap)
# Placeholder: the La Jolla basemap could not be fetched (Google Maps API
# request limit). The commented code below documents the intended workflow
# so it can be re-enabled once a map request succeeds.
print("Here will be La Jolla area map when GoogleMaps will allow me to get one")
## [1] "Here will be La Jolla area map when GoogleMaps will allow me to get one"
# la_jolla_area <- get_googlemap(center = c(-117.25, 32.90),# "La Jolla area"
#                                maptype = "terrain",
#                                zoom = 12,
#                                size = c(640, 640),
#                                color = "color")
# # Save an object to a file
# saveRDS(la_jolla_area, file = "la_jolla_area.rds")
# # Restore the object
# la_jolla_area <- readRDS(file = "la_jolla_area.rds")
# stationMeans <- yearlyResult %>% filter(targetlatitude > 32.80,
#     targetlatitude < 33.05) %>%
#     group_by(stationgroup, unit, analyte,
#         targetlatitude, targetlongitude) %>%
#     summarize(logMeanPollution = mean(log10(yearlyMean+1), na.rm = T))
# ggmap(san_diego_map) +
#       geom_point(data =
#               stationMeans[stationMeans$unit=="cfu/100mL" &
#                       stationMeans$analyte=="Enterococcus", ],
#            aes(x = targetlongitude,
#            y = targetlatitude,
#           color = stationgroup,
#                size = logMeanPollution), shape = 20) +
#     ggtitle('Average Station Pollution Values for Enterococcus,
#         Measured as cfu/100mL')
# ggmap(la_jolla_area) +
#       geom_point(data =
#               stationMeans[stationMeans$unit=="cfu/100mL" &
#                       stationMeans$analyte=="Enterococcus", ],
#            aes(x = targetlongitude,
#            y = targetlatitude,
#           color = stationgroup,
#                size = logMeanPollution), shape = 20) +
#     ggtitle('Average Station Bacteria Counts for Enterococcus,
#         Measured as cfu/100mL')